{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 83,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import math\n",
    "\n",
    "from keras.layers import Embedding, Dense, LSTM, Dense, Input, concatenate\n",
    "from keras.models import Model\n",
    "from keras.utils import plot_model\n",
    "\n",
    "from keras.preprocessing.text import Tokenizer\n",
    "from keras.preprocessing.sequence import pad_sequences"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 94,
   "metadata": {
    "collapsed": false,
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/warproxxx/anaconda3/lib/python3.5/site-packages/ipykernel/__main__.py:1: FutureWarning: Sorting because non-concatenation axis is not aligned. A future version\n",
      "of pandas will change to not sort by default.\n",
      "\n",
      "To accept the future behavior, pass 'sort=False'.\n",
      "\n",
      "To retain the current behavior and silence the warning, pass 'sort=True'.\n",
      "\n",
      "  if __name__ == '__main__':\n"
     ]
    }
   ],
   "source": [
    "# The bot population is spread over three spambot dumps; genuine accounts come from one file.\n",
    "bot_account_files = ['data/social_spambots_1.csv', 'data/social_spambots_2.csv', 'data/social_spambots_3.csv']\n",
    "# sort=False: the dumps do not have perfectly aligned columns, and the explicit column\n",
    "# selection below fixes the column order anyway, so sorting is unnecessary (and this\n",
    "# avoids the pandas FutureWarning about implicit sorting).\n",
    "bot_accounts = pd.concat([pd.read_csv(path) for path in bot_account_files], sort=False).reset_index(drop=True)\n",
    "clean_accounts = pd.read_csv('data/geniune_accounts.csv')\n",
    "\n",
    "requiredColumns = ['screen_name', 'created_at', 'updated', 'location', 'verified', 'statuses_count', 'friends_count','followers_count', 'favourites_count', 'default_profile_image', 'profile_use_background_image', 'protected', 'default_profile']\n",
    "# .copy() so later column assignments act on independent frames instead of on\n",
    "# selections of the raw data (which triggers SettingWithCopyWarning).\n",
    "bot_accounts = bot_accounts[requiredColumns].copy()\n",
    "clean_accounts = clean_accounts[requiredColumns].copy()\n",
    "\n",
    "def clean_df(df):\n",
    "    \"\"\"Turn raw account metadata into the numeric feature frame used for training.\n",
    "\n",
    "    The input frame is not modified; all work happens on a copy.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    df : pandas.DataFrame\n",
    "        Must contain the columns screen_name, created_at, updated, location,\n",
    "        verified, statuses_count, friends_count, followers_count,\n",
    "        favourites_count, default_profile_image, profile_use_background_image,\n",
    "        protected and default_profile.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    pandas.DataFrame\n",
    "        Columns username, age, has_location, is_verified, total_tweets,\n",
    "        total_following, total_followers, total_likes, has_avatar,\n",
    "        has_background, is_protected and profile_modified, with the index\n",
    "        labels converted to strings.\n",
    "\n",
    "    Notes\n",
    "    -----\n",
    "    Flag columns are presence-based: any non-null value counts as set.\n",
    "    NOTE(review): this assumes the CSVs leave a flag empty when it is false -- confirm.\n",
    "    \"\"\"\n",
    "    # Copy first so the caller's frame keeps its original columns and dtypes.\n",
    "    df = df.copy()\n",
    "    df['created_at'] = pd.to_datetime(df['created_at'])\n",
    "    df['updated'] = pd.to_datetime(df['updated'])\n",
    "    # Whole days between account creation and the 'updated' timestamp.\n",
    "    # .dt.days replaces astype('timedelta64[D]'), which newer pandas versions reject.\n",
    "    df['age'] = (df['updated'] - df['created_at']).dt.days.astype(int)\n",
    "    # A non-null location means the account has one (the earlier lambda returned 0 here).\n",
    "    df['has_location'] = df['location'].notna().astype(int)\n",
    "    # NOTE(review): Twitter's default_profile_image marks accounts still using the default\n",
    "    # avatar, so a null value is read as 'has a custom avatar'. The earlier code had this\n",
    "    # flag the other way round, unlike profile_modified below -- confirm against the CSVs.\n",
    "    df['has_avatar'] = df['default_profile_image'].isna().astype(int)\n",
    "    df['has_background'] = df['profile_use_background_image'].notna().astype(int)\n",
    "    df['is_verified'] = df['verified'].notna().astype(int)\n",
    "    df['is_protected'] = df['protected'].notna().astype(int)\n",
    "    # default_profile set => profile untouched => profile_modified is 0.\n",
    "    df['profile_modified'] = df['default_profile'].isna().astype(int)\n",
    "    df = df.rename(index=str, columns={\"screen_name\": \"username\", \"statuses_count\": \"total_tweets\", \"friends_count\": \"total_following\", \"followers_count\": \"total_followers\", \"favourites_count\": \"total_likes\"})\n",
    "    return df[['username', 'age', 'has_location', 'is_verified', 'total_tweets', 'total_following', 'total_followers', 'total_likes', 'has_avatar', 'has_background', 'is_protected', 'profile_modified']]\n",
    "\n",
    "# Run the same feature extraction over both populations.\n",
    "bot_accounts, clean_accounts = (clean_df(frame) for frame in (bot_accounts, clean_accounts))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 95,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Label the two populations: 1 = bot, 0 = genuine account.\n",
    "bot_accounts['BotOrNot'] = 1\n",
    "clean_accounts['BotOrNot'] = 0\n",
    "\n",
    "combined_df = pd.concat([bot_accounts, clean_accounts])\n",
    "\n",
    "# Fixed seed so the shuffle -- and the train/test split later sliced from new_df --\n",
    "# comes out the same on every Restart & Run All.\n",
    "SHUFFLE_SEED = 42\n",
    "new_df = combined_df.sample(frac=1, random_state=SHUFFLE_SEED).reset_index(drop=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 102,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# 'username' is an identifier, not a feature, so it is dropped before splitting.\n",
    "features_df = new_df.drop('username', axis=1)\n",
    "\n",
    "# 80/20 split; the cut index is computed from the same frame that is sliced.\n",
    "split_at = int(features_df.shape[0] * 0.8)\n",
    "# .copy() so the standardization below writes to independent frames, not to slices.\n",
    "training_df = features_df[:split_at].copy()\n",
    "test_df = features_df[split_at:].copy()\n",
    "\n",
    "columns_to_standardize = ['age', 'total_tweets', 'total_following', 'total_followers', 'total_likes']\n",
    "\n",
    "# Statistics come from the training rows only and are reused unchanged for the test rows.\n",
    "training_df_mean = training_df[columns_to_standardize].mean()\n",
    "# A constant column has std 0; divide by 1 instead so it maps to 0 rather than NaN/inf.\n",
    "training_df_std = training_df[columns_to_standardize].std().replace(0, 1)\n",
    "\n",
    "training_df[columns_to_standardize] = (training_df[columns_to_standardize] - training_df_mean)/training_df_std\n",
    "test_df[columns_to_standardize] = (test_df[columns_to_standardize] - training_df_mean)/training_df_std"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 103,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Columns that are not fed to the model: the label itself and the is_protected flag.\n",
    "non_feature_columns = ['BotOrNot', 'is_protected']\n",
    "\n",
    "# Feature matrices are 2-D arrays; labels become (n, 1) column vectors.\n",
    "X_train = training_df.drop(columns=non_feature_columns).values\n",
    "y_train = training_df['BotOrNot'].values[:, np.newaxis]\n",
    "\n",
    "X_test = test_df.drop(columns=non_feature_columns).values\n",
    "y_test = test_df['BotOrNot'].values[:, np.newaxis]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 104,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Input width follows the feature matrix (10 columns with the current feature set), so\n",
    "# the model stays in sync if feature columns are added or removed upstream.\n",
    "inp = Input(shape=(X_train.shape[1],))\n",
    "\n",
    "# Two ReLU hidden layers, then a single sigmoid unit for the BotOrNot label (1 = bot).\n",
    "another = Dense(500, activation='relu')(inp)\n",
    "another = Dense(200, activation='relu')(another)\n",
    "another = Dense(1, activation='sigmoid')(another)\n",
    "\n",
    "mod = Model(inp, another)\n",
    "mod.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 105,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Train on 6708 samples, validate on 1678 samples\n",
      "Epoch 1/20\n",
      "6708/6708 [==============================] - 0s 52us/step - loss: 0.2552 - acc: 0.9044 - val_loss: 0.1675 - val_acc: 0.9565\n",
      "Epoch 2/20\n",
      "6708/6708 [==============================] - 0s 26us/step - loss: 0.1244 - acc: 0.9620 - val_loss: 0.1388 - val_acc: 0.9577\n",
      "Epoch 3/20\n",
      "6708/6708 [==============================] - 0s 27us/step - loss: 0.1101 - acc: 0.9684 - val_loss: 0.1317 - val_acc: 0.9631\n",
      "Epoch 4/20\n",
      "6708/6708 [==============================] - 0s 26us/step - loss: 0.1028 - acc: 0.9679 - val_loss: 0.1273 - val_acc: 0.9732\n",
      "Epoch 5/20\n",
      "6708/6708 [==============================] - 0s 27us/step - loss: 0.0993 - acc: 0.9678 - val_loss: 0.1211 - val_acc: 0.9702\n",
      "Epoch 6/20\n",
      "6708/6708 [==============================] - 0s 26us/step - loss: 0.0872 - acc: 0.9723 - val_loss: 0.1249 - val_acc: 0.9690\n",
      "Epoch 7/20\n",
      "6708/6708 [==============================] - 0s 26us/step - loss: 0.0821 - acc: 0.9736 - val_loss: 0.1159 - val_acc: 0.9726\n",
      "Epoch 8/20\n",
      "6708/6708 [==============================] - 0s 27us/step - loss: 0.0844 - acc: 0.9747 - val_loss: 0.1174 - val_acc: 0.9744\n",
      "Epoch 9/20\n",
      "6708/6708 [==============================] - 0s 27us/step - loss: 0.0785 - acc: 0.9760 - val_loss: 0.1216 - val_acc: 0.9708\n",
      "Epoch 10/20\n",
      "6708/6708 [==============================] - 0s 27us/step - loss: 0.0774 - acc: 0.9754 - val_loss: 0.1061 - val_acc: 0.9744\n",
      "Epoch 11/20\n",
      "6708/6708 [==============================] - 0s 27us/step - loss: 0.0771 - acc: 0.9775 - val_loss: 0.1039 - val_acc: 0.9750\n",
      "Epoch 12/20\n",
      "6708/6708 [==============================] - 0s 26us/step - loss: 0.0758 - acc: 0.9775 - val_loss: 0.0934 - val_acc: 0.9768\n",
      "Epoch 13/20\n",
      "6708/6708 [==============================] - 0s 28us/step - loss: 0.0725 - acc: 0.9778 - val_loss: 0.0995 - val_acc: 0.9797\n",
      "Epoch 14/20\n",
      "6708/6708 [==============================] - 0s 27us/step - loss: 0.0676 - acc: 0.9791 - val_loss: 0.1034 - val_acc: 0.9750\n",
      "Epoch 15/20\n",
      "6708/6708 [==============================] - 0s 27us/step - loss: 0.0734 - acc: 0.9790 - val_loss: 0.1101 - val_acc: 0.9738\n",
      "Epoch 16/20\n",
      "6708/6708 [==============================] - 0s 27us/step - loss: 0.0705 - acc: 0.9785 - val_loss: 0.0974 - val_acc: 0.9768\n",
      "Epoch 17/20\n",
      "6708/6708 [==============================] - 0s 27us/step - loss: 0.0730 - acc: 0.9788 - val_loss: 0.0912 - val_acc: 0.9791\n",
      "Epoch 18/20\n",
      "6708/6708 [==============================] - 0s 26us/step - loss: 0.0679 - acc: 0.9787 - val_loss: 0.1315 - val_acc: 0.9648\n",
      "Epoch 19/20\n",
      "6708/6708 [==============================] - 0s 26us/step - loss: 0.0655 - acc: 0.9796 - val_loss: 0.0933 - val_acc: 0.9768\n",
      "Epoch 20/20\n",
      "6708/6708 [==============================] - 0s 32us/step - loss: 0.0649 - acc: 0.9800 - val_loss: 0.0900 - val_acc: 0.9785\n"
     ]
    }
   ],
   "source": [
    "# NOTE(review): the held-out test rows double as the validation set here, so the\n",
    "# reported val_* numbers are also the test numbers.\n",
    "fit_options = dict(batch_size=64, epochs=20, validation_data=(X_test, y_test))\n",
    "training = mod.fit(x=X_train, y=y_train, **fit_options)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 106,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Persist the trained model to an HDF5 file in the working directory.\n",
    "mod.save(filepath='my_model.h5')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "anaconda-cloud": {},
  "kernelspec": {
   "display_name": "Python [conda env:anaconda3]",
   "language": "python",
   "name": "conda-env-anaconda3-py"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
